full_data <- readRDS('data/full_data.rds')

Compute Daily Max H2S

H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = max(H2S, na.rm = TRUE)) %>%
  mutate(H2S_daily_max = if_else(H2S_daily_max == -Inf, NA, H2S_daily_max))
## Warning: There were 501 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1215: `day = 2020-07-12`, `Monitor = "First Methodist"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run ]8;;ide:run:dplyr::last_dplyr_warnings()dplyr::last_dplyr_warnings()]8;; to see the 500 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
H2S_dm_graph <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor", y = 'Daily Max H2S', x = 'time') +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph) %>% layout(dragmode = 'pan')
H2S_dm_graph_b <- H2S_dm %>%
 ggplot(aes(x=day, y=H2S_daily_max, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Daily Max H2S concentration by monitor w/o outlier", y = 'Daily Max H2S', x = 'time') +
   scale_y_continuous(limits = c(0, 100)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_dm_graph_b) %>% layout(dragmode = 'pan')
H2S_ma_graph <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph) %>% layout(dragmode = 'pan')
H2S_ma_graph_b <- H2S_ma %>%
 ggplot(aes(x=yearmonth, y=H2S_monthly_average, group=Monitor, color=Monitor)) +
   geom_line() +
   labs(title = "Monthly Average H2S concentration by monitor w/o outlier", y = 'Monthly Average H2S', x = 'time') +
   scale_x_discrete(breaks = unique(H2S_ma$yearmonth)[seq(1, length(unique(H2S_ma$yearmonth)), by = 6)]) +
  scale_y_continuous(limits=c(0, 50)) +
   theme_minimal() +
   theme(axis.text.x = element_text(angle = 45))

ggplotly(H2S_ma_graph_b) %>% layout(dragmode = 'pan')

The line with the largest deviation/peak corresponds to the 213th&Chico monitor, in 2021-10

full_data %>%
  polarPlot(pollutant = "H2S", col = "turbo", 
            key.position = "bottom",
            key.header = "mean H2S", 
            key.footer = NULL, title = 'hi')

Explore some GAM models

h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        9.091e-01  1.084e+00   0.839   0.4016    
## year2021                           3.076e-01  1.082e+00   0.284   0.7763    
## year2022                          -7.302e-01  1.083e+00  -0.675   0.5000    
## wd_secQ2                           8.995e-02  4.755e-02   1.892   0.0585 .  
## wd_secQ3                           8.260e-01  5.171e-02  15.974  < 2e-16 ***
## wd_secQ4                           4.469e-01  4.417e-02  10.118  < 2e-16 ***
## ws                                -9.682e-02  5.944e-03 -16.290  < 2e-16 ***
## I(1/MinDist^2)                     2.222e+04  3.730e+04   0.596   0.5514    
## RefineryMarathon (Carson)          2.989e+00  5.907e-02  50.603  < 2e-16 ***
## RefineryMarathon (Wilmington)      8.388e-01  6.456e-02  12.993  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -5.341e-01  5.964e-02  -8.955  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   3.880e-01  6.332e-02   6.127 8.94e-10 ***
## RefineryTorrance Refinery          8.476e-02  6.389e-02   1.327   0.1846    
## RefineryValero Refinery            5.988e-01  6.286e-02   9.526  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(as.numeric(month)) 7.996      8 881.4  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.00848   Deviance explained = 0.849%
## GCV = 336.53  Scale est. = 336.52    n = 1730935
plot(h2s_model_a)

h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+
                   Refinery+s(latitude, bs = 'tp')+s(longitude, bs='tp'), 
                   data = full_data)
summary(h2s_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws + 
##     I(1/MinDist^2) + Refinery + s(latitude, bs = "tp") + s(longitude, 
##     bs = "tp")
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.421e+02  1.395e+01 -10.184  < 2e-16 ***
## year2021                           6.814e-01  1.066e+00   0.639   0.5227    
## year2022                           2.109e-01  1.066e+00   0.198   0.8432    
## wd_secQ2                           2.133e-01  4.684e-02   4.553 5.28e-06 ***
## wd_secQ3                           5.200e-01  5.096e-02  10.204  < 2e-16 ***
## wd_secQ4                           5.159e-01  4.353e-02  11.852  < 2e-16 ***
## ws                                -1.081e-01  5.855e-03 -18.465  < 2e-16 ***
## I(1/MinDist^2)                     6.171e+06  1.672e+05  36.917  < 2e-16 ***
## RefineryMarathon (Carson)          1.090e+02  1.552e+01   7.021 2.21e-12 ***
## RefineryMarathon (Wilmington)      1.434e+02  1.604e+01   8.938  < 2e-16 ***
## RefineryPhillips 66 (Wilimington)  1.913e+02  1.550e+01  12.339  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   2.008e+02  1.586e+01  12.662  < 2e-16 ***
## RefineryTorrance Refinery          2.543e+01  1.177e+01   2.161   0.0307 *  
## RefineryValero Refinery            1.878e+02  1.642e+01  11.434  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df        F p-value    
## s(as.numeric(month)) 7.999      8 1019.368  <2e-16 ***
## s(latitude)          1.000      1 2698.973  <2e-16 ***
## s(longitude)         1.000      1    6.084  0.0156 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0381   Deviance explained = 3.82%
## GCV = 326.46  Scale est. = 326.45    n = 1730935
plot(h2s_model_b)

h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_dm_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_sec + 
##     ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.024e+00  5.659e+00   0.888  0.37466    
## year2021                           1.330e+00  5.651e+00   0.235  0.81393    
## year2022                          -8.367e+00  5.652e+00  -1.480  0.13878    
## wd_secQ2                          -1.341e+00  2.445e-01  -5.484 4.16e-08 ***
## wd_secQ3                           3.643e+00  2.663e-01  13.679  < 2e-16 ***
## wd_secQ4                          -2.626e-01  2.275e-01  -1.154  0.24832    
## ws                                 4.156e-02  3.054e-02   1.361  0.17364    
## I(1/MinDist^2)                     9.242e+04  1.926e+05   0.480  0.63123    
## RefineryMarathon (Carson)          2.569e+01  3.040e-01  84.515  < 2e-16 ***
## RefineryMarathon (Wilmington)      4.007e+00  3.327e-01  12.043  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -8.985e-01  3.075e-01  -2.922  0.00348 ** 
## RefineryPhillips 66 (Wilmington)   3.154e+00  3.267e-01   9.656  < 2e-16 ***
## RefineryTorrance Refinery          8.734e-01  3.292e-01   2.653  0.00799 ** 
## RefineryValero Refinery            5.897e+00  3.242e-01  18.185  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.998      8 2750  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0229   Deviance explained =  2.3%
## GCV = 9173.7  Scale est. = 9173.6    n = 1777112
plot(h2s_dm_model_a)

h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, bs = 'tp')+s(longitude, bs='tp'), data = full_data)
summary(h2s_dm_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_sec + 
##     ws + I(1/MinDist^2) + Refinery + s(latitude, bs = "tp") + 
##     s(longitude, bs = "tp")
## 
## Parametric coefficients:
##                                     Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                       -1.278e+03  7.783e+00 -164.226  < 2e-16 ***
## year2021                           4.754e+00  5.471e+00    0.869 0.384956    
## year2022                          -7.489e-02  5.472e+00   -0.014 0.989080    
## wd_secQ2                          -2.293e-01  2.404e-01   -0.954 0.340185    
## wd_secQ3                           9.515e-01  2.615e-01    3.638 0.000275 ***
## wd_secQ4                           2.505e-01  2.234e-01    1.121 0.262101    
## ws                                -4.294e-02  3.005e-02   -1.429 0.152934    
## I(1/MinDist^2)                     5.494e+07  4.597e+05  119.509  < 2e-16 ***
## RefineryMarathon (Carson)          9.825e+02  5.680e+00  172.977  < 2e-16 ***
## RefineryMarathon (Wilmington)      1.287e+03  5.952e+00  216.151  < 2e-16 ***
## RefineryPhillips 66 (Wilimington)  1.718e+03  6.956e+00  246.937  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   1.799e+03  7.270e+00  247.406  < 2e-16 ***
## RefineryTorrance Refinery          2.327e+02  2.939e+00   79.186  < 2e-16 ***
## RefineryValero Refinery            1.687e+03  7.051e+00  239.284  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F p-value    
## s(as.numeric(month)) 7.999      8  3609  <2e-16 ***
## s(latitude)          1.000      1 69775  <2e-16 ***
## s(longitude)         1.000      1 25305  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.108   Deviance explained = 10.8%
## GCV =   8598  Scale est. = 8597.9    n = 1730935
plot(h2s_dm_model_b)

h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        4.562e+00  4.624e+00   0.987 0.323830    
## year2021                           1.076e+00  4.618e+00   0.233 0.815796    
## year2022                          -8.169e+00  4.618e+00  -1.769 0.076930 .  
## wd_secQ2                          -7.359e-01  1.981e-01  -3.714 0.000204 ***
## wd_secQ3                           3.097e+00  2.157e-01  14.360  < 2e-16 ***
## wd_secQ4                          -7.004e-01  1.846e-01  -3.794 0.000148 ***
## ws                                 1.185e-01  2.461e-02   4.815 1.47e-06 ***
## I(1/MinDist^2)                     2.964e+04  1.569e+05   0.189 0.850144    
## RefineryMarathon (Carson)          2.395e+01  2.453e-01  97.616  < 2e-16 ***
## RefineryMarathon (Wilmington)      4.224e+00  2.706e-01  15.613  < 2e-16 ***
## RefineryPhillips 66 (Wilimington) -2.615e-02  2.499e-01  -0.105 0.916656    
## RefineryPhillips 66 (Wilmington)   3.524e+00  2.658e-01  13.262  < 2e-16 ***
## RefineryTorrance Refinery          1.570e+00  2.636e-01   5.956 2.58e-09 ***
## RefineryValero Refinery            6.305e+00  2.632e-01  23.960  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df    F p-value    
## s(as.numeric(month)) 7.999      8 3996  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0307   Deviance explained = 3.07%
## GCV = 6125.1  Scale est. = 6125      n = 1814982
plot(h2s_ma_model_a)

h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, bs = 'tp')+s(longitude, bs='tp'), data = full_data)
summary(h2s_ma_model_b)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year + 
##     wd_sec + ws + I(1/MinDist^2) + Refinery + s(latitude, bs = "tp") + 
##     s(longitude, bs = "tp")
## 
## Parametric coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.223e+03  3.807e+01 -32.123  < 2e-16 ***
## year2021                           4.242e+00  4.442e+00   0.955 0.339588    
## year2022                          -5.266e-01  4.442e+00  -0.119 0.905645    
## wd_secQ2                           4.182e-01  1.951e-01   2.143 0.032118 *  
## wd_secQ3                           7.243e-01  2.123e-01   3.411 0.000647 ***
## wd_secQ4                          -1.549e-01  1.814e-01  -0.854 0.393162    
## ws                                 4.626e-02  2.439e-02   1.896 0.057917 .  
## I(1/MinDist^2)                     5.254e+07  4.773e+05 110.085  < 2e-16 ***
## RefineryMarathon (Carson)          9.403e+02  4.217e+01  22.300  < 2e-16 ***
## RefineryMarathon (Wilmington)      1.231e+03  4.360e+01  28.243  < 2e-16 ***
## RefineryPhillips 66 (Wilimington)  1.644e+03  4.218e+01  38.982  < 2e-16 ***
## RefineryPhillips 66 (Wilmington)   1.722e+03  4.316e+01  39.891  < 2e-16 ***
## RefineryTorrance Refinery          2.226e+02  3.196e+01   6.967 3.23e-12 ***
## RefineryValero Refinery            1.615e+03  4.467e+01  36.149  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                      edf Ref.df        F p-value    
## s(as.numeric(month))   8      8  5187.30  <2e-16 ***
## s(latitude)            1      1 24696.07  <2e-16 ***
## s(longitude)           1      1    55.74  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.144   Deviance explained = 14.4%
## GCV =   5667  Scale est. = 5666.9    n = 1730935
plot(h2s_ma_model_b)